set.seed(2)

required_packages <- c("tidyverse", "magrittr", "DBI", "bigrquery", "arrow","glue", "vroom","janitor", "gt", "ggwordcloud", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools", "showtext", "pander")
for(i in required_packages) { 
  if(!require(i, character.only = T)) {
    #  if package is not existing, install then load the package
    install.packages(i, dependencies = T)
  require(i, character.only = T)
  }
}

panderOptions('table.alignment.default', "left")

## quality of png's
dpi <- 750

## theme updates; please adjust to client´s website
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))


theme_update(plot.margin = margin(30, 30, 30, 30),
             plot.background = element_rect(color = "white",
                                            fill = "white"),
             plot.title = element_text(size = 20,
                                       face = "bold",
                                       lineheight = 1.05,
                                       hjust = .5,
                                       margin = margin(10, 0, 25, 0)),
             plot.title.position = "plot",
             plot.caption = element_text(color = "grey40",
                                         size = 9,
                                         margin = margin(20, 0, -20, 0)),
             plot.caption.position = "plot",
             axis.line.x = element_line(color = "black",
                                        size = .8),
             axis.line.y = element_line(color = "black",
                                        size = .8),
             axis.title.x = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(t = 20)),
             axis.title.y = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(r = 20)),
             axis.text = element_text(size = 11,
                                      color = "black",
                                      face = "bold"),
             axis.text.x = element_text(margin = margin(t = 10)),
             axis.text.y = element_text(margin = margin(r = 10)),
             axis.ticks = element_blank(),
             panel.grid.major.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.major.y = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.y = element_blank(),
             panel.spacing.x = unit(4, "lines"),
             panel.spacing.y = unit(2, "lines"),
             legend.position = "top",
             legend.title = element_text(family = "Montserrat",
                                         color = "black",
                                         size = 14,
                                         margin = margin(5, 0, 5, 0)),
             legend.text = element_text(family = "Montserrat",
                                        color = "black",
                                        size = 11,
                                        margin = margin(4.5, 4.5, 4.5, 4.5)),
             legend.background = element_rect(fill = NA,
                                              color = NA),
             legend.key = element_rect(color = NA, fill = NA),
             #legend.key.width = unit(5, "lines"),
             #legend.spacing.x = unit(.05, "pt"),
             #legend.spacing.y = unit(.55, "pt"),
             #legend.margin = margin(0, 0, 10, 0),
             strip.text = element_text(face = "bold",
                                       margin = margin(b = 10)))

## theme settings for flipped plots
theme_flip <-
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_line(size = .6,
                                          color = "#eaeaea"))

## theme settings for maps
theme_map <- 
  theme_void(base_family = "Montserrat") +
  theme(legend.direction = "horizontal",
        legend.box = "horizontal",
        legend.margin = margin(10, 10, 10, 10),
        legend.title = element_text(size = 17, 
                                    face = "bold"),
        legend.text = element_text(color = "grey33",
                                   size = 12),
        plot.margin = margin(15, 5, 15, 5),
        plot.title = element_text(face = "bold",
                                  size = 20,
                                  hjust = .5,
                                  margin = margin(30, 0, 10, 0)),
        plot.subtitle = element_text(face = "bold",
                                     color = "grey33",
                                     size = 17,
                                     hjust = .5,
                                     margin = margin(10, 0, -30, 0)),
        plot.caption = element_text(size = 14,
                                    color = "grey33",
                                    hjust = .97,
                                    margin = margin(-30, 0, 0, 0)))

## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)

## main color backlinko
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")

## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")

## colors for degrees (Bachelors, Massters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))

## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)
display_number <- function(n, title){
  tibble(!!sym(title) := format(n, big.mark = ",")) %>% pander()
}
con <- dbConnect(
    bigrquery::bigquery(),
    project = "dataforseo-bigquery",
    billing = "dataforseo-bigquery"
)

sql <- glue("SELECT * FROM `dataforseo-bigquery.dataforseo_data.keyword_data` 
          WHERE location = 2840 
          ORDER BY keyword_info_search_volume DESC
          LIMIT 50000")
tb <- bq_project_query("dataforseo-bigquery", sql)
top <- bq_table_download(tb, max_results = 1000000)

Basic stats

SELECT COUNT(keyword_info_search_volume) as `total_count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
total_count <- sql$total_count
display_number(total_count, "Total number of different searches")
Total number of different searches
580,082,623
SELECT SUM(COALESCE(keyword_info_search_volume / 10000, 0)) AS total_volume
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
# Calculated in a roundabout way to avoid integer overflow
total_volume <- sql$total_volume * 10000
tibble("Total number of searches" = glue("~{format(round(total_volume / 1000000), big.mark = ',')} million")) %>% 
  pander()
Total number of searches
~302,567 million


This table shows the top 10 searches. They are almost all spelling errors. As in, they are not really searched, but rather people attempting to go to Youtube or Facebook, but typing it wrong. Oddly they are all attributed as having a search volume of exactly 185 million.

top %>% 
  select(keyword, location, spell, spell_type, keyword_info_search_volume) %>% 
  head(10) %>% 
  gt() %>% 
  tab_options(table.align = "left") %>% 
  tab_header("Top searches")
Top searches
keyword location spell spell_type keyword_info_search_volume
face bak 2840 facebook did_you_mean 185000000
yput 2840 youtube did_you_mean 185000000
xyoutube 2840 youtube showing_results_for 185000000
utub 2840 youtube did_you_mean 185000000
yioutub 2840 youtube showing_results_for 185000000
fa9ebook 2840 facebook showing_results_for 185000000
ioyoutube 2840 youtube showing_results_for 185000000
kn youtube 2840 on did_you_mean 185000000
the only youtube 2840 185000000
youtbe == 2840 youtube did_you_mean 185000000
SELECT COUNT(*) as `missing_count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume IS NULL
missing_count <- sql$missing_count
tibble("Missing search volume" = scales::percent(missing_count / total_count, accuracy = 0.001)) %>% pander()
Missing search volume
0.514%

The missing have some searches that are likely high volume. Thus they are truly missing, and not just 0s.

SELECT keyword, keyword_info_search_volume
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume IS NULL
ORDER BY RAND()
LIMIT 10
sql %>% gt() %>% tab_options(table.align = "left") %>% 
  tab_header("Keywords with missing search volume")
Keywords with missing search volume
keyword keyword_info_search_volume
Crain's HCSC compensation NA
Glycine sweetener NA
South Carolina Republican primary 2020 polls NA
Jo Malone 針管香水 評價 NA
Starting Strength program NA
DeWalt Mini Circular saw NA
Juilliard contact NA
Garden soil delivery NA
Child and Family Center jobs NA
Best entry indicator MT4 NA
SELECT COUNT(*) as `zero_count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume = 0
zero_count <- sql$zero_count
tibble("Searches with search volume 0" = scales::percent(zero_count / total_count, accuracy = 0.001)) %>% pander()
Searches with search volume 0
47.243%
display_number(total_volume / total_count, "Mean search volume")
Mean search volume
521.5929
SELECT approx_quantiles(keyword_info_search_volume, 2)[offset(1)] AS `median`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
display_number(sql$median, "Median search volume")
Median search volume
10
SELECT AVG(`keyword_info_cpc`) AS `mean_cpc`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
display_number(sql$mean_cpc, "Mean CPC")
Mean CPC
0.3202192
SELECT approx_quantiles(keyword_info_cpc, 2)[offset(1)] AS `median_cpc`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
display_number(sql$median_cpc, "Median CPC")
Median CPC
0


Spell types

SELECT *
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND spell_type <> ""
AND keyword_info_search_volume < 10000
LIMIT 1000
SELECT spell_type, SUM(keyword_info_search_volume) / 10000 AS `volume`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
GROUP BY spell_type
spell_types <- sql %>% mutate(spell_type = ifelse(spell_type == "", "no_spell_type", spell_type)) %>% 
  mutate(volume = volume * 10000) 


spell_types %>% ggplot(aes(x = reorder(spell_type, volume), y = volume)) +
  geom_bar(stat = "identity", width = 0.8, fill = "turquoise4", color = "black") +
  labs(x = "", title = "Spell types")

About half of search volume has a spell type. This is especially driven by misspellings of common domains.

spell_type_proportion <- spell_types %>% filter(spell_type != "no_spell_type") %$% 
  sum(volume) / total_volume
tibble("Proportion of searches with spell type" = scales::percent(spell_type_proportion, accuracy = 0.1)) %>% pander()
Proportion of searches with spell type
47.9%


top %>% group_by(spell) %>% 
  summarise(volume = sum(keyword_info_search_volume)) %>% 
  arrange(desc(volume)) %>% 
  filter(spell != "") %>% 
  head(15) %>% 
  gt() %>% 
  tab_options(table.align = "left") %>% 
  tab_header("Top 15 intended searches that are misspelled")
Top 15 intended searches that are misspelled
spell volume
youtube 46879296000
facebook 11555936000
amazon 10041630000
google 8313882000
weather 2955470000
translate 2098786000
com 1994509000
instagram 1783527000
walmart 1731168000
ebay 1566900000
yahoo 1422318000
youtubecom 925000000
netflix 920823000
news 910156000
you 789786000


Questions

question_words <- c("what", "which", "where", "who", "why", "how")
questions <- tribble(~question, ~volume)
for (word in question_words){
  sql <- glue("SELECT sum(keyword_info_search_volume) FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
            WHERE location = 2840 
            AND keyword like '{word} %' ")
  tb <- bq_project_query("dataforseo-bigquery", sql)
  df <- bq_table_download(tb) 
  questions %<>% add_row(question = word, volume = df$f0_)
}

questions %>% mutate(prop = volume / total_volume) %>% 
  ggplot(aes(x = reorder(question, prop), y = prop)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.005), expand = c(0,0)) +
  labs(title = "Questions in searches", x = "", y = "")

SELECT SUM(COALESCE(keyword_info_search_volume / 10000, 0)) AS total_volume
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND spell_type = ""
tibble("Total percentage of searches that are questions" = 
         scales::percent(questions %$% sum(volume) / total_volume, accuracy = 0.001)) %>% 
  pander()
Total percentage of searches that are questions
0.806%


total_volume_nospell <- sql$total_volume * 10000

tibble("Total percentage of searches that are questions if spell types are removed" = 
         scales::percent(questions %$% sum(volume) / total_volume_nospell, accuracy = 0.001)) %>% 
  pander()
Total percentage of searches that are questions if spell types are removed
1.547%


Stopwords

stopwords_list <- tibble(stopword = stopwords::stopwords(language = "en")) %>% 
  mutate(stopword = str_remove(stopword, "'")) %>% 
  filter(!(stopword %in% c("shed", "wed", "ill", "hell", "shell")))

get_stopwords_counts <- function(){
stopwords <- tribble(~stopword, ~volume)
  for (word in stopword_list$stopword){
    print(word)
    sql <- glue(
      "SELECT SUM(COALESCE(keyword_info_search_volume / 1000, 0)) AS `total_volume` 
       FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
       WHERE location = 2840 
       AND keyword like '% {word} %' OR keyword like '{word} %' OR keyword like '% {word}'")
    tb <- bq_project_query("dataforseo-bigquery", sql)
  
    df <- bq_table_download(tb) 
    stopwords %<>% add_row(stopword = word, volume = df$total_volume * 1000)
  }
  
  write_csv(stopwords, "../proc_data/stopwords.csv")
}
stopwords <- read_csv("../proc_data/stopwords.csv")

stopwords %>% mutate(prop = volume / total_volume) %>%
  filter(prop > 0.001) %>% 
  ggplot(aes(x = reorder(stopword, prop), y = prop)) +
  geom_bar(stat = "identity", color = "black", fill = "turquoise4", width = 0.7) +
  scale_y_continuous(labels = scales::percent, expand = c(0,0), limits = c(0, 0.05)) +
  labs(x = "", y = "", title = "Searches with specific stopwords") +
  coord_flip()


set.seed(2)
stopwords %>% 
  mutate(stopword = ifelse(stopword == "i", "I", stopword)) %>% 
  arrange(desc(volume)) %>% 
  head(25) %>% 
  ggplot(aes(label = stopword, size = volume, color = factor(sample.int(10, 25, replace = TRUE)))) +
  geom_text_wordcloud(perc_step = 0.5) +
  scale_size_area(max_size = 30) +
  theme_minimal()


Search tails

volume_top <- top %>%  
  add_rownames() %>% 
  mutate(rowname = as.numeric(rowname)) %>% 
  select(rowname, volume = keyword_info_search_volume)

ylab <- c(50, 100, 150, 200)

volume_top %>% 
  mutate(cat = case_when(
    rowname < 500 ~ "Top 500",
    rowname < 1000 ~ "Top 1000",
    rowname < 2000 ~ "Top 2000",
    rowname < 5801 ~ "Top 0.001%",
    T ~ "Remaining 99.99%"
  )) %>% 
  mutate(cat = factor(cat, levels = c("Top 500", "Top 1000", "Top 2000", "Top 0.001%", "Remaining 99.99%"))) %>% 
  head(10500) %>% 
  ggplot(aes(x = rowname, y = volume, fill = cat)) +
  geom_area(alpha = 0.8) +
  scale_y_continuous(
    labels = glue("{ylab} M"),
    breaks = 10^6 * ylab,
    limits = c(0, 200* 10^6), 
    expand = c(0,0)
    ) +
  labs(x = "", title = "Long tail") + 
  geom_segment(aes(x = 8000, y = 35*10^6, xend = 10500, yend = 35*10^6),
               arrow = arrow(length = unit(0.5, "cm"))) +
  scale_x_continuous(expand = c(0,0))


get_count_range <-  function(lower, higher)
{
  sql <- glue(
        "SELECT COUNT(*) AS `count` 
         FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
         WHERE location = 2840 
         AND keyword_info_search_volume >= {lower} 
         AND keyword_info_search_volume <= {higher}")
      tb <- bq_project_query("dataforseo-bigquery", sql)
      bq_table_download(tb)$count
}

df <- tribble(
  ~cat, ~count,
  "0 - 10", get_count_range(0, 10),
  "11- 100", get_count_range(11, 100),
  "101 - 1000", get_count_range(101, 1000),
  "1001 - 10000", get_count_range(1001, 10000),
  "10001 - 100000", get_count_range(10001, "100000"),
  "100001+", get_count_range("100001", "100000000000")) 


df %>% 
  mutate(count = count / sum(count)) %>% 
  ggplot(aes(x = reorder(cat, desc(count)), y = count)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black") +
  theme_light() +
  labs(x = "", y = "", title = "Long tail") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1), expand = c(0,0))


write_length_volume <- function()
{
  get_length_volume <-  function(l)
  {
    sql <- glue(
          "SELECT sum(keyword_info_search_volume) / 10000 as `volume`, count(keyword_info_search_volume) as `count`
           FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
           WHERE location = 2840
           AND keyword_info_search_volume IS NOT NULL
           AND LENGTH(keyword) = {l}")
        tb <- bq_project_query("dataforseo-bigquery", sql)
        bq_table_download(tb) %>% mutate(length = l)
  }
  
  df <- map_df(1:50, get_length_volume)  
  write_csv(df, "../proc_data/keyword_length_volume.csv")
}
df <- read_csv("../proc_data/keyword_length_volume.csv")

df %>% mutate(
  volume = 10000 * volume,
  ratio = volume / count
) %>% 
  ggplot(aes(x = length, y = ratio)) +
  geom_line(color = "turquoise4", size = 1) + geom_point(color = "turquoise4") +
  theme_light() +
  labs(x = "", y = "", title = "Average search volume by keyword length")


Keyword_info categories

pservices <- read_csv("../raw_data/productsservices.csv") %>% 
  clean_names() %>% rename(c1 = criterion_id) %>% select(-category) %>% 
  separate(c1, sep =",\"", into = c("id", "category")) %>% 
  mutate(category = substr(category, 2, nchar(category) -1)) %>% 
  separate(category, sep = "/", into = c("cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8"))

toplevel <- pservices %>% filter(is.na(cat2))
write_categories <- function()
{
  get_category_volume <-  function(id){
    sql <- glue(
      "SELECT SUM(keyword_info_search_volume) / 10000 AS `search_volume`, AVG(keyword_info_cpc) AS `cpc`, COUNT(*) AS `count`
       FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
       WHERE location = 2840 
       AND keyword_info_categories like '%{id}%' ")
    tb <- bq_project_query("dataforseo-bigquery", sql)
    bq_table_download(tb) %>% mutate(id = id)
  }

  df <- map_df(toplevel$id, get_category_volume)  
  
  df %>% mutate(search_volume = search_volume * 10000,
                mean_volume = search_volume / count
                ) %>% 
    left_join(toplevel %>% select(id, cat1), by = "id") %>% 
    write_csv("../proc_data/categories_averages.csv")
}

df <- read_csv("../proc_data/categories_averages.csv")
df %>% 
  ggplot(aes(x = fct_rev(cat1), y = mean_volume)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  theme_light() +
  coord_flip() +
  scale_y_continuous(limits = c(0, 9000), expand = c(0,0)) +
  labs(x = "", y = "", title = "Search volume mean by category")


df %>% 
  ggplot(aes(x = fct_rev(cat1), y = cpc)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  theme_light() +
  coord_flip() +
  scale_y_continuous(limits = c(0, 2), expand = c(0,0)) +
  labs(x = "", y = "", title = "CPC mean by category")